virtualenv --no-download ~/.kb_env
source ~/.kb_env/bin/activate
pip install -U pip
pip install kb-python
pip install scanpy
pip install scvelo
pip install python-igraph
pip install louvain
pip install jupyterlab
wget ftp://ftp.ensembl.org/pub/release-98/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
wget ftp://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
wget ftp://ftp.ensembl.org/pub/release-98/gtf/mus_musculus/Mus_musculus.GRCm38.98.gtf.gz
gunzip /path/to/reference/fasta
gunzip /path/to/reference/gtf
mkdir index; cd index #optional
kb ref -i index.idx -g transcripts_to_genes.txt -f1 cdna.fa -f2 intron.fa -c1 cdna_transcripts_to_capture.txt -c2 intron_transcripts_to_capture.txt --lamanno /path/to/reference/fasta /path/to/reference/gtf
linux
index/
├── cdna.fa
├── cdna_transcripts_to_capture.txt
├── index.idx
├── intron.fa
├── intron_transcripts_to_capture.txt
└── transcripts_to_genes.txt
INDEXDIR=/path/to/folder/with/index # where you saved the indexes in step 2.2
INDEX=${INDEXDIR}index.idx
TtG=${INDEXDIR}transcripts_to_genes.txt
TECH=DropSeq # 10Xv1, 10Xv2 and other technologies available - see http://pachterlab.github.io/kallisto//releases/2018/11/17/v0.45.0 for more
OUTDIR=/path/to/output_folder
cDNA_TtC=${INDEXDIR}cdna_transcripts_to_capture.txt
INTRON=${INDEXDIR}intron_transcripts_to_capture.txt
FASTQDIR=/path/to/fastq_dir
FASTQ=${FASTQDIR}/*
kb count --lamanno --verbose --loom -m 4G -i $INDEX -g $TtG -x $TECH -o $OUTDIR -c1 $cDNA_TtC -c2 $INTRON_TtC $FASTQ
$OUTDIR should look like this¶linux
$OUTDIR/
├── counts_unfilteredÂ
│  ├── adata.loom
│  ├── spliced
│  ├── spliced.barcodes.txt
│  ├── spliced.genes.txt
│  ├── spliced.mtx
│  ├── unspliced
│  ├── unspliced.barcodes.txt
│  ├── unspliced.genes.txt
│  └── unspliced.mtx
├── inspect.json
├── matrix.ec
├── output.bus
├── output.unfiltered.bus
├── run_info.json
├── spliced.unfiltered.bus
├── transcripts.txt
├── unspliced.unfiltered.bus
└── whitelist.txt
$OUTDIR/counts_unfiltered/adata.loom which contains the spliced and unspliced count matrices¶import loompy
import pandas as pd
lfile = loompy.connect("/path/to/loom/file.loom")
t2g = pd.read_csv("/path/to/index/transcripts_to_genes.txt", sep = "\t", header = None, names=["tid", "gid", "gene"])
gene_names = t2g[["gid","gene"]]
unique_names = gene_names.drop_duplicates()
unique_names.index = unique_names["gid"]
unique_names = unique_names.reindex(lfile.row_attrs["var_names"])
lfile.row_attrs["gene_names"] = unique_names["gene"].values
lfile.close()
gene_names slot, as seen below.¶wd = "/home/jhowa105/projects/def-jsjoyal/jhowa105/kb_tut" with the path to your data¶import numpy as np
import pandas as pd
import scanpy as sc
import os
import loompy
import scvelo as scv
wd = "/home/jhowa105/projects/def-jsjoyal/jhowa105/kb_tut" #replace
sc.logging.print_versions()
results_file = wd + "analysis.h5ad"
sc.settings.set_figure_params(dpi=100)
adata = sc.read_loom(filename = os.path.join(wd, "adata.loom"))
adata
sc.pl.highest_expr_genes(adata, n_top=20, gene_symbols="gene_names")
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=10)
adata
mito_genes = adata.var["gene_names"].str.startswith('mt-') # capture mitochondrial genes
crystal_genes = adata.var["gene_names"].str.contains("^Cry[a-b]") # capture genes starting with Crya or Cryb - lense contamination
adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1) # add percent mito metadata
adata.obs['percent_crystal'] = np.sum(adata[:, crystal_genes].X, axis=1) / np.sum(adata.X, axis=1) # add percent Cry metadata
adata.obs['n_counts'] = adata.X.sum(axis=1) # add count depth metadata
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True)
sc.pl.scatter(adata, x='n_counts', y='n_genes', color = "percent_mito")
sc.pl.scatter(adata, x='n_counts', y='n_genes', color = "percent_crystal")
adata = adata[adata.obs['percent_mito'] < 0.1, :]
adata = adata[adata.obs['percent_crystal'] < 0.1, :]
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
adata.raw = adata
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['Rgs5','Neurod1','Ckb'], gene_symbols = "gene_names", use_raw=True)
sc.pl.pca_variance_ratio(adata, log=True)
adata.write(results_file)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=15)
sc.tl.umap(adata)
sc.pl.umap(adata, color = ['n_counts','Rgs5','Neurod1','Ckb'], gene_symbols = "gene_names")
sc.tl.louvain(adata)
sc.pl.umap(adata, color=['louvain', 'Rgs5', 'Ckb'], wspace = 0.5, gene_symbols = "gene_names")
adata.write(results_file)
sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, gene_symbols = "gene_names")
adata.write(results_file)
# get top markers as gene names, instead of gene ids
top_markers = [j for i in adata.uns["rank_genes_groups"]["names"].dtype.names for j in adata.uns["rank_genes_groups"]['names'][i][0:5]]
top_markers_genes = []
for gene_id in top_markers:
gene_name = adata.raw.var["gene_names"][gene_id]
if gene_name not in top_markers_genes:
top_markers_genes.append(gene_name)
sc.pl.dotplot(adata, top_markers_genes, groupby='louvain', gene_symbols = "gene_names")
sc.pl.umap(adata, color=['louvain', 'Cryab', 'Ckb', 'Sfrp1', 'Neurod4','Rpgrip1', 'Apoe', 'Pf4', 'Top2a', 'Col9a1', 'Cdkn1c', 'Tulp1', 'Rgs5', 'Tfap2b', 'Ptgds', 'Cd93', 'Hba-a2', 'Meg3'], wspace = 0.5, gene_symbols="gene_names")
scv.pp.moments(adata)
scv.tl.velocity(adata, mode='stochastic')
scv.tl.velocity_graph(adata)
scv.pl.velocity_embedding(adata, basis='umap')
scv.pl.velocity_embedding_stream(adata, basis='umap')
scv.pl.velocity_embedding_grid(adata, basis='umap')
scv.pl.velocity_graph(adata)
adata.write(results_file)